Load Data

# Read the metabolism data set and add the 3/4-power mass term,
# which is used as an alternative predictor below.
metabolic_data <- read.csv("Metabolism Data Prob 26.csv")
metabolic_data$Mass_mod <- metabolic_data$Mass^0.75
summary(metabolic_data)
##                 CommonName                       Species  
##  Mouse               : 4   Antechinomus_laniger      : 1  
##  Rat                 : 4   Antechinomus_stuartii     : 1  
##  Woodrat             : 4   Antechinus_macdonnellensis: 1  
##  Rodent              : 3   Antilocapra_americana     : 1  
##  Australian_marsupial: 2   Blarina_brevicauda        : 1  
##  Bat                 : 2   Bradypus_variegatus       : 1  
##  (Other)             :76   (Other)                   :89  
##       Mass               Metab                Life         Mass_mod       
##  Min.   :   0.0036   Min.   :     5.17   Min.   : 0.8   Min.   :  0.0147  
##  1st Qu.:   0.0990   1st Qu.:    39.05   1st Qu.: 3.5   1st Qu.:  0.1763  
##  Median :   1.9800   Median :   320.00   Median : 8.0   Median :  1.6692  
##  Mean   :  69.6215   Mean   :  5173.49   Mean   :12.3   Mean   : 14.3155  
##  3rd Qu.:  11.9000   3rd Qu.:  1695.00   3rd Qu.:15.5   3rd Qu.:  6.4044  
##  Max.   :3000.0000   Max.   :165000.00   Max.   :75.0   Max.   :405.3600  
## 

Question 1: Metabolic Rate vs Mass

Assess the relationship between mass and metabolic rate:

{meta|mass}: meta = B0 + B1(mass)

or

{meta|mass^0.75}: meta = B0 + B1(mass^0.75)

# Overlay the two candidate predictors on one scatter plot:
# raw mass in red, mass^0.75 in blue.
ggplot(metabolic_data) +
  geom_point(aes(x = Mass, y = Metab), color = "Red") +
  geom_point(aes(x = Mass_mod, y = Metab), color = "Blue")

Data does appear to be linear for both inputs (mass and mass^0.75)

Trend is stronger for X^3/4
# Fit metabolic rate against the transformed (3/4-power) mass
mass_model <- lm(Metab ~ Mass_mod, data = metabolic_data)
summary(mass_model)
## 
## Call:
## lm(formula = Metab ~ Mass_mod, data = metabolic_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11712.7   -117.4    368.5    474.3   6598.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -481.346    213.467  -2.255   0.0265 *  
## Mass_mod     395.016      4.299  91.895   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1992 on 93 degrees of freedom
## Multiple R-squared:  0.9891, Adjusted R-squared:  0.989 
## F-statistic:  8445 on 1 and 93 DF,  p-value: < 2.2e-16

Plot CL

# Scatter plot with geom_smooth's own 99% confidence band
bird_plot <- ggplot(metabolic_data, aes(x = Mass_mod, y = Metab)) +
  geom_point(color = "Blue", size = 2) +
  geom_smooth(method = "lm", se = TRUE, level = 0.99, color = "Red")

# Build the sorted predictor grid once and reuse it, instead of
# re-sorting the column for every call and every layer.
mass_grid <- data.frame(Mass_mod = sort(metabolic_data$Mass_mod))

# 99% prediction intervals (new observation) and 99% confidence
# intervals (mean response). Spell out "prediction"/"confidence"
# rather than relying on partial matching of 'predict'.
predictions_cl <- cbind(mass_grid,
                        predict(mass_model, newdata = mass_grid,
                                interval = "prediction", level = 0.99))
regress_cl <- cbind(mass_grid,
                    predict(mass_model, newdata = mass_grid,
                            interval = "confidence", level = 0.99))

# Overlay both bands; x now comes from the same data frame as the
# interval columns, so rows cannot get out of sync.
bird_plot +
  geom_line(data = predictions_cl, aes(x = Mass_mod, y = upr), color = "Green") +
  geom_line(data = predictions_cl, aes(x = Mass_mod, y = lwr), color = "Green") +
  geom_line(data = regress_cl, aes(x = Mass_mod, y = upr), color = "Blue") +
  geom_line(data = regress_cl, aes(x = Mass_mod, y = lwr), color = "Blue")

# Base-R diagnostic plots for the fitted model
plot(mass_model)

Try log transform

# Work on a copy with log-transformed response and predictor
log_data <- metabolic_data
log_data$Meta_log <- log(log_data$Metab)
log_data$Mass_mod_log <- log(log_data$Mass_mod)

# Scatter of the log-log data. The original drew the identical
# points twice (Red, then Blue on top); one layer gives the same
# final picture.
ggplot(data = log_data) +
  geom_point(aes(x = Mass_mod_log, y = Meta_log), color = "Blue")

# Log-log fit
log_model <- lm(Meta_log ~ Mass_mod_log, data = log_data)

# Scatter + geom_smooth 99% band on the log-log scale
bird_plot <- ggplot(log_data, aes(x = Mass_mod_log, y = Meta_log)) +
  geom_point(color = "Blue", size = 2) +
  geom_smooth(method = "lm", se = TRUE, level = 0.99, color = "Red")

# One sorted predictor grid shared by both interval computations
log_grid <- data.frame(Mass_mod_log = sort(log_data$Mass_mod_log))

# Full names "prediction"/"confidence" instead of partial-matched 'predict'
predictions_cl <- cbind(log_grid,
                        predict(log_model, newdata = log_grid,
                                interval = "prediction", level = 0.99))
regress_cl <- cbind(log_grid,
                    predict(log_model, newdata = log_grid,
                            interval = "confidence", level = 0.99))

# Prediction band (green) and confidence band (blue); x is taken
# from the same data frame as upr/lwr.
bird_plot +
  geom_line(data = predictions_cl, aes(x = Mass_mod_log, y = upr), color = "Green") +
  geom_line(data = predictions_cl, aes(x = Mass_mod_log, y = lwr), color = "Green") +
  geom_line(data = regress_cl, aes(x = Mass_mod_log, y = upr), color = "Blue") +
  geom_line(data = regress_cl, aes(x = Mass_mod_log, y = lwr), color = "Blue")

# Informal normality check: residual histogram on the density scale
# with a matching normal curve overlaid.
m <- mean(log_model$residuals)
std <- sd(log_model$residuals)  # sd() instead of sqrt(var())
hist(log_model$residuals, breaks = 20, prob = TRUE,
     xlab = "Residual", ylim = c(0, 2),
     main = "normal curve over histogram")
# yaxt has no effect when add = TRUE, so it is dropped
curve(dnorm(x, mean = m, sd = std),
      col = "darkblue", lwd = 2, add = TRUE)

# Standard lm diagnostics (residuals vs fitted, QQ, scale-location, leverage)
plot(log_model)

Much Better to use log/log

Meets assumptions for normality, linearity, equal variance and independence

Outliers are not an issue according to Cook's distance measures

# Summary of the log-log fit: slope ~0.985, R^2 ~0.965
summary(log_model)
## 
## Call:
## lm(formula = Meta_log ~ Mass_mod_log, data = log_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14216 -0.26466 -0.04889  0.25308  1.37616 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.63833    0.04709  119.73   <2e-16 ***
## Mass_mod_log  0.98499    0.01949   50.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4572 on 93 degrees of freedom
## Multiple R-squared:  0.9649, Adjusted R-squared:  0.9645 
## F-statistic:  2553 on 1 and 93 DF,  p-value: < 2.2e-16
# Default 95% confidence intervals for intercept and log-log slope
confint(log_model)
##                  2.5 %   97.5 %
## (Intercept)  5.5448128 5.731849
## Mass_mod_log 0.9462828 1.023700

Regression Equation

{log(Meta)|log(Mass^0.75)}: log(Meta) = B0 + B1·log(Mass^0.75) = 5.64 + 0.98·log(Mass^0.75)

Interpretation

A doubling of the transformed mass (Mass^0.75) corresponds to a 2^0.98 − 1 ≈ 98% increase in the estimated median metabolic rate; a doubling of raw mass corresponds to 2^(0.75 × 0.98) − 1 ≈ 67%.

A 95% confidence interval for this doubling effect is [2^0.946 − 1, 2^1.024 − 1] ≈ [93%, 103%]

96% of the variability is explained by the model

Question 2

# Read the autism prevalence data (columns: Year, Prevalence)
autism_data <- read.csv("Autism Data Prob 29.csv")
summary(autism_data)
##       Year        Prevalence   
##  Min.   :1992   Min.   : 3.50  
##  1st Qu.:1994   1st Qu.: 5.30  
##  Median :1996   Median : 7.80  
##  Mean   :1996   Mean   : 9.34  
##  3rd Qu.:1998   3rd Qu.:11.80  
##  Max.   :2000   Max.   :18.30
# Simple linear model of prevalence on calendar year
autism_model <- lm(Prevalence ~ Year, data = autism_data)
summary(autism_model)
## 
## Call:
## lm(formula = Prevalence ~ Year, data = autism_data)
## 
## Residuals:
##     1     2     3     4     5 
##  1.38 -0.43 -1.54 -1.15  1.74 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -3593.440    540.858  -6.644  0.00695 **
## Year            1.805      0.271   6.661  0.00690 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.714 on 3 degrees of freedom
## Multiple R-squared:  0.9367, Adjusted R-squared:  0.9156 
## F-statistic: 44.37 on 1 and 3 DF,  p-value: 0.006897
# Diagnostic plots for the untransformed fit
plot(autism_model)

# Scatter of prevalence by year. The original plotted the same
# points twice (Red then Blue on top); a single layer produces the
# identical final plot.
ggplot(data = autism_data) +
  geom_point(aes(x = Year, y = Prevalence), color = "Blue")

# autism_model was already fit above; refitting the identical model
# here is redundant and has been removed.

bird_plot <- ggplot(autism_data, aes(x = Year, y = Prevalence)) +
  geom_point(color = "Blue", size = 2) +
  geom_smooth(method = "lm", se = TRUE, level = 0.99, color = "Red")

# Sorted predictor grid reused for both interval types
year_grid <- data.frame(Year = sort(autism_data$Year))

# Spell out "prediction"/"confidence" instead of partial-matched 'predict'
predictions_cl <- cbind(year_grid,
                        predict(autism_model, newdata = year_grid,
                                interval = "prediction", level = 0.99))
regress_cl <- cbind(year_grid,
                    predict(autism_model, newdata = year_grid,
                            interval = "confidence", level = 0.99))

bird_plot +
  geom_line(data = predictions_cl, aes(x = Year, y = upr), color = "Green") +
  geom_line(data = predictions_cl, aes(x = Year, y = lwr), color = "Green") +
  geom_line(data = regress_cl, aes(x = Year, y = upr), color = "Blue") +
  geom_line(data = regress_cl, aes(x = Year, y = lwr), color = "Blue")

# Residual histogram (density scale) with an overlaid normal curve.
# Only 5 observations, so this check is very rough.
m <- mean(autism_model$residuals)
std <- sd(autism_model$residuals)  # sd() instead of sqrt(var())
hist(autism_model$residuals, breaks = 20, prob = TRUE,
     xlab = "x-variable", ylim = c(0, 2),
     main = "normal curve over histogram")
# yaxt has no effect when add = TRUE, so it is dropped
curve(dnorm(x, mean = m, sd = std),
      col = "darkblue", lwd = 2, add = TRUE)

# Standard lm diagnostic plots
plot(autism_model)

The untransformed model fits poorly — try a transformation.

Log Linear
# Log-linear model: log(prevalence) as a linear function of year
log_data <- autism_data
log_data$Prevalence_log <- log(log_data$Prevalence)

autism_model <- lm(Prevalence_log ~ Year, data = log_data)
summary(autism_model)
## 
## Call:
## lm(formula = Prevalence_log ~ Year, data = log_data)
## 
## Residuals:
##         1         2         3         4         5 
##  0.004578  0.008655 -0.015795 -0.012686  0.015248 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.080e+02  4.953e+00  -82.38 3.94e-06 ***
## Year         2.054e-01  2.481e-03   82.79 3.88e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01569 on 3 degrees of freedom
## Multiple R-squared:  0.9996, Adjusted R-squared:  0.9994 
## F-statistic:  6855 on 1 and 3 DF,  p-value: 3.884e-06
# Diagnostic plots for the log-linear fit
plot(autism_model)

# Scatter on the log scale. One layer replaces the original's
# duplicated Red/Blue point layers (identical coordinates).
ggplot(data = log_data) +
  geom_point(aes(x = Year, y = Prevalence_log), color = "Blue")

# autism_model was already refit on the log scale above; the
# redundant refit has been removed.

bird_plot <- ggplot(log_data, aes(x = Year, y = Prevalence_log)) +
  geom_point(color = "Blue", size = 2) +
  geom_smooth(method = "lm", se = TRUE, level = 0.99, color = "Red")

# Sorted predictor grid built once for both interval types
year_grid <- data.frame(Year = sort(log_data$Year))

predictions_cl <- cbind(year_grid,
                        predict(autism_model, newdata = year_grid,
                                interval = "prediction", level = 0.99))
regress_cl <- cbind(year_grid,
                    predict(autism_model, newdata = year_grid,
                            interval = "confidence", level = 0.99))

bird_plot +
  geom_line(data = predictions_cl, aes(x = Year, y = upr), color = "Green") +
  geom_line(data = predictions_cl, aes(x = Year, y = lwr), color = "Green") +
  geom_line(data = regress_cl, aes(x = Year, y = upr), color = "Blue") +
  geom_line(data = regress_cl, aes(x = Year, y = lwr), color = "Blue")

# Residual histogram with a normal-density overlay. Use a density
# histogram (prob = TRUE): with freq = TRUE the count bars and the
# dnorm() curve are on incompatible vertical scales, so the overlay
# was meaningless.
m <- mean(autism_model$residuals)
std <- sd(autism_model$residuals)  # sd() instead of sqrt(var())
hist(autism_model$residuals, prob = TRUE, breaks = 20)
curve(dnorm(x, mean = m, sd = std),
      col = "darkblue", lwd = 2, add = TRUE)

# Standard lm diagnostic plots
plot(autism_model)

Much better!

It is still very hard to justify a regression on such a small sample (n = 5), but that is what the assignment asks for.

Meets assumptions for normality, linearity, equal variance and independence

Outliers are not an issue according to Cook's distance measures

Regression Equation

{log(Prevalence)|Year}: log(Prevalence) = B0 + B1·Year = -408 + 0.205·Year

Interpretation

Each additional year multiplies the estimated median prevalence by e^0.205 ≈ 1.23 — a 23% increase per year.

A 95% confidence interval for this effect is [22%, 24%]

99% of the variability is explained by the model

(note that there is no reason to suspect that year should be a part of this model, model is nonsense)